In [ ]:
from pymongo import MongoClient
from datetime import datetime, date, timedelta
from dateutil import parser
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from mpl_toolkits.basemap import Basemap
matplotlib.style.use('ggplot')
class TweetDatabase():
    """Picks one of the two tweet collections in the ``data_science`` database.

    ``isNew`` is stored as a flag; ``connect`` uses it to choose between the
    ``new_tweets`` and ``old_tweets`` collections.
    """

    def __init__(self, isNew):
        # MongoClient() is called without arguments, i.e. with the driver defaults.
        self.conn = MongoClient().data_science
        self.new = isNew

    def connect(self):
        """Return ``new_tweets`` when the flag is truthy, else ``old_tweets``."""
        database = self.conn
        return database.new_tweets if self.new else database.old_tweets
class Config():
    """Holds the date range associated with each of the two tweet sets.

    ``isNew`` is stored as a flag that selects which range ``period`` returns.
    """

    def __init__(self, isNew):
        self.new = isNew

    def period(self):
        """Return a ``(first_day, last_day)`` tuple of ``date`` objects."""
        if not self.new:
            return (date(2014, 10, 27), date(2014, 12, 14))
        return (date(2014, 12, 15), date(2015, 3, 22))
In [ ]:
# Count the tweets in the old_tweets collection per calendar day.
conf = Config(False)
start, stop = conf.period()
period = pd.date_range(start, stop)
# Tweets per day: one zero-initialised slot for every date in the period.
tpd = pd.Series(0, index=period)

db = TweetDatabase(False).connect()
tweets = db.find()
for doc in tweets:
    # ignoretz=True discards the timezone part of the string, so the date is
    # taken exactly as written in 'created_at'.
    # NOTE(review): if 'created_at' is stored in UTC these are UTC days - confirm.
    stamp = parser.parse(doc['created_at'], ignoretz=True)
    day_label = stamp.date().isoformat()
    # The Series is addressed by ISO date string; a date outside the period
    # is not in the index and raises.
    tpd[day_label] += 1
tpd
In [ ]:
# Descriptive statistics of the per-day counts; kept as a bare expression,
# presumably so the notebook displays the result as the cell output.
tpd.describe()
In [ ]:
# Bar chart of the per-day counts in `tpd`, saved as a PNG.
fig, ax = plt.subplots()
ax.bar(tpd.index, tpd, color='b')
ax.set_title('Tweets per day (old timetable)')
ax.set_xlabel('Date')
ax.set_ylabel('Amount of Tweets')
# Only the upper y limit is fixed.
ax.set_ylim(ymax=1800)
# Rotate/align the date tick labels so they do not overlap.
fig.autofmt_xdate()
fig.savefig('./../../Paper/plots/old_tweets_per_day.png')
In [ ]:
# Map of every tweet in the old_tweets collection that carries coordinates.
db = TweetDatabase(False).connect()
geotweets = db.find({'coordinates': {'$ne': None}})
# Mercator map bounded by latitude 49..52 and longitude 1..8 (degrees).
m = Basemap(resolution='i', projection='merc',
            llcrnrlat=49.0, urcrnrlat=52.0, llcrnrlon=1., urcrnrlon=8.0, lat_ts=51.0)
m.drawcountries()
m.drawcoastlines()
m.fillcontinents()

# Collect all positions first so the points are projected and drawn in one
# call instead of creating one plot artist per tweet.
lons, lats = [], []
for tweet in geotweets:
    # assumes GeoJSON ordering [longitude, latitude] - TODO confirm
    lon, lat = tweet['coordinates']['coordinates']
    lons.append(lon)
    lats.append(lat)

if lons:
    # Calling the Basemap instance converts lon/lat to map x/y coordinates.
    xs, ys = m(lons, lats)
    m.plot(xs, ys, 'b.', alpha=0.5)

plt.title('Tweets with geolocation (old timetable)')
plt.savefig('./../../Paper/plots/old_tweets_geo.png')
# Count what was actually iterated rather than calling Cursor.count(), which
# newer PyMongo releases no longer provide. A single pre-formatted string
# prints identically under the Python 2 statement and the Python 3 function.
print("Amount of tweets with geolocation: %d" % len(lons))
In [ ]:
# Count the tweets in the old_tweets collection per hour of day (0-23),
# looking only at tweets created Monday through Friday.
period = np.arange(24)
db = TweetDatabase(False).connect()
tweets = db.find()

hour_totals = [0] * 24
for doc in tweets:
    # ignoretz=True discards the timezone part, so the hour is read as written.
    # NOTE(review): if 'created_at' is stored in UTC these are UTC hours - confirm.
    stamp = parser.parse(doc['created_at'], ignoretz=True)
    # weekday() numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
    if stamp.weekday() >= 5:
        continue
    hour_totals[stamp.hour] += 1

sr = pd.Series(hour_totals, index=period)
sr
In [ ]:
# Descriptive statistics of the per-hour counts; kept as a bare expression,
# presumably so the notebook displays the result as the cell output.
sr.describe()
In [ ]:
# Line plot of the per-hour counts in `sr` together with a smoothed curve.
plt.figure()
# Centered 3-hour moving average. Series.rolling replaces pd.rolling_mean,
# which was deprecated and later removed from pandas. The first and last
# hour have no complete window and are therefore NaN.
rolling = sr.rolling(3, center=True).mean()
# Raw counts dashed, moving average solid, both on the same axes.
ax_delays = sr.plot(style='--', color='b')
rolling.plot(color='b', ax=ax_delays, legend=0)
plt.xticks(np.arange(0, 24, 2))
plt.title('Tweets per hour (old timetable)')
plt.xlabel('Hour')
plt.ylabel('Tweets')
# Only the upper y limit is fixed.
plt.ylim(ymax=2500)
plt.savefig('./../../Paper/plots/old_tweets_per_hour.png')
In [ ]:
# Count the tweets in the new_tweets collection per calendar day.
conf = Config(True)
start, stop = conf.period()
period = pd.date_range(start, stop)
# Tweets per day: one zero-initialised slot for every date in the period.
tpd = pd.Series(0, index=period)

db = TweetDatabase(True).connect()
tweets = db.find()
for doc in tweets:
    # ignoretz=True discards the timezone part of the string, so the date is
    # taken exactly as written in 'created_at'.
    # NOTE(review): if 'created_at' is stored in UTC these are UTC days - confirm.
    stamp = parser.parse(doc['created_at'], ignoretz=True)
    day_label = stamp.date().isoformat()
    # The Series is addressed by ISO date string; a date outside the period
    # is not in the index and raises.
    tpd[day_label] += 1
tpd
In [ ]:
# Descriptive statistics of the per-day counts; kept as a bare expression,
# presumably so the notebook displays the result as the cell output.
tpd.describe()
In [ ]:
# Bar chart of the per-day counts in `tpd`, saved as a PNG.
fig, ax = plt.subplots()
ax.bar(tpd.index, tpd, color='r')
ax.set_title('Tweets per day (new timetable)')
ax.set_xlabel('Date')
ax.set_ylabel('Amount of Tweets')
# Only the upper y limit is fixed.
ax.set_ylim(ymax=1800)
# Rotate/align the date tick labels so they do not overlap.
fig.autofmt_xdate()
fig.savefig('./../../Paper/plots/new_tweets_per_day.png')
In [ ]:
# Map of every tweet in the new_tweets collection that carries coordinates.
db = TweetDatabase(True).connect()
geotweets = db.find({'coordinates': {'$ne': None}})
# Mercator map bounded by latitude 49..52 and longitude 1..8 (degrees).
m = Basemap(resolution='i', projection='merc',
            llcrnrlat=49.0, urcrnrlat=52.0, llcrnrlon=1., urcrnrlon=8.0, lat_ts=51.0)
m.drawcountries()
m.drawcoastlines()
m.fillcontinents()

# Collect all positions first so the points are projected and drawn in one
# call instead of creating one plot artist per tweet.
lons, lats = [], []
for tweet in geotweets:
    # assumes GeoJSON ordering [longitude, latitude] - TODO confirm
    lon, lat = tweet['coordinates']['coordinates']
    lons.append(lon)
    lats.append(lat)

if lons:
    # Calling the Basemap instance converts lon/lat to map x/y coordinates.
    xs, ys = m(lons, lats)
    m.plot(xs, ys, 'r.', alpha=0.5)

plt.title('Tweets with geolocation (new timetable)')
plt.savefig('./../../Paper/plots/new_tweets_geo.png')
# Count what was actually iterated rather than calling Cursor.count(), which
# newer PyMongo releases no longer provide. A single pre-formatted string
# prints identically under the Python 2 statement and the Python 3 function.
print("Amount of tweets with geolocation: %d" % len(lons))
In [ ]:
# Count the tweets in the new_tweets collection per hour of day (0-23),
# looking only at tweets created Monday through Friday.
period = np.arange(24)
db = TweetDatabase(True).connect()
tweets = db.find()

hour_totals = [0] * 24
for doc in tweets:
    # ignoretz=True discards the timezone part, so the hour is read as written.
    # NOTE(review): if 'created_at' is stored in UTC these are UTC hours - confirm.
    stamp = parser.parse(doc['created_at'], ignoretz=True)
    # weekday() numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
    if stamp.weekday() >= 5:
        continue
    hour_totals[stamp.hour] += 1

sr = pd.Series(hour_totals, index=period)
sr
In [ ]:
# Descriptive statistics of the per-hour counts; kept as a bare expression,
# presumably so the notebook displays the result as the cell output.
sr.describe()
In [ ]:
# Line plot of the per-hour counts in `sr` together with a smoothed curve.
plt.figure()
# Centered 3-hour moving average. Series.rolling replaces pd.rolling_mean,
# which was deprecated and later removed from pandas. The first and last
# hour have no complete window and are therefore NaN.
rolling = sr.rolling(3, center=True).mean()
# Raw counts dashed, moving average solid, both on the same axes.
ax_delays = sr.plot(style='--', color='r')
rolling.plot(color='r', ax=ax_delays, legend=0)
plt.xticks(np.arange(0, 24, 2))
plt.title('Tweets per hour (new timetable)')
plt.xlabel('Hour')
plt.ylabel('Tweets')
# Only the upper y limit is fixed.
plt.ylim(ymax=2500)
plt.savefig('./../../Paper/plots/new_tweets_per_hour.png')